import numpy as np
import pandas as pd
import dask.dataframe as dd
from IPython.display import display
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
%%time
df = dd.read_parquet(r"C:\Users\prath\OneDrive\Desktop\microsoft_malware_prediction\Data Exploration\train.parquet")
df = df.compute()
CPU times: total: 22.7 s Wall time: 1min 38s
print(df.shape)
(8921483, 83)
display(df.head())
| MachineIdentifier | ProductName | EngineVersion | AppVersion | AvSigVersion | IsBeta | RtpStateBitfield | IsSxsPassiveMode | DefaultBrowsersIdentifier | AVProductStatesIdentifier | ... | Census_FirmwareVersionIdentifier | Census_IsSecureBootEnabled | Census_IsWIMBootEnabled | Census_IsVirtualDevice | Census_IsTouchEnabled | Census_IsPenCapable | Census_IsAlwaysOnAlwaysConnectedCapable | Wdft_IsGamer | Wdft_RegionIdentifier | HasDetections | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000028988387b115f69f31a3bf04f09 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1735.0 | 0 | 7.0 | 0 | NaN | 53447.0 | ... | 36144.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 10.0 | 0 |
| 1 | 000007535c3f730efa9ea0b7ef1bd645 | win8defender | 1.1.14600.4 | 4.13.17134.1 | 1.263.48.0 | 0 | 7.0 | 0 | NaN | 53447.0 | ... | 57858.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 8.0 | 0 |
| 2 | 000007905a28d863f6d0d597892cd692 | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1341.0 | 0 | 7.0 | 0 | NaN | 53447.0 | ... | 52682.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 3.0 | 0 |
| 3 | 00000b11598a75ea8ba1beea8459149f | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1527.0 | 0 | 7.0 | 0 | NaN | 53447.0 | ... | 20050.0 | 0 | NaN | 0.0 | 0 | 0 | 0.0 | 0.0 | 3.0 | 1 |
| 4 | 000014a5f00daa18e76b81417eeb99fc | win8defender | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1379.0 | 0 | 7.0 | 0 | NaN | 53447.0 | ... | 19844.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0.0 | 0.0 | 1.0 | 1 |
5 rows × 83 columns
fig = px.bar(df["HasDetections"].value_counts().reset_index(),
x="index", y="HasDetections", color="index", title="Has Detection(target)")
fig.show()
df["HasDetections"].value_counts()
0 4462591 1 4458892 Name: HasDetections, dtype: int64
%%time
stats = []
for col in df.columns:
stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)
CPU times: total: 14 s Wall time: 16.4 s
| Feature | Unique_values | Percentage of missing values | Percentage of values in the biggest category | type | |
|---|---|---|---|---|---|
| 28 | PuaMode | 2 | 99.974119 | 99.974119 | category |
| 41 | Census_ProcessorClass | 3 | 99.589407 | 99.589407 | category |
| 8 | DefaultBrowsersIdentifier | 2017 | 95.141637 | 95.141637 | float32 |
| 68 | Census_IsFlightingInternal | 2 | 83.044030 | 83.044030 | float32 |
| 52 | Census_InternalBatteryType | 78 | 71.046809 | 71.046809 | category |
| ... | ... | ... | ... | ... | ... |
| 1 | ProductName | 6 | 0.000000 | 98.935569 | category |
| 45 | Census_HasOpticalDiskDrive | 2 | 0.000000 | 92.281272 | int8 |
| 54 | Census_OSVersion | 469 | 0.000000 | 15.845202 | category |
| 55 | Census_OSArchitecture | 3 | 0.000000 | 90.858045 | category |
| 82 | HasDetections | 2 | 0.000000 | 50.020731 | int8 |
83 rows × 5 columns
stats_df = stats_df.astype({'type': 'string'})
table_data = stats_df.head(50)
fig = go.Figure(data=[go.Table(
header=dict(values=list(table_data.columns),
fill_color='paleturquoise',
align='left',
font=dict(color='black', size=12)),
cells=dict(values=[table_data[column] for column in table_data.columns],
fill_color='lavender',
align='left',
font=dict(color='black', size=11)))
])
fig.show()
train= pd.DataFrame(df)
good_cols = list(df.columns)
for col in df.columns:
rate = df[col].value_counts(normalize=True, dropna=False).values[0]
if rate > 0.9: # Removing Missing Values if
good_cols.remove(col)
train= df[good_cols]
train.head()
| MachineIdentifier | EngineVersion | AppVersion | AvSigVersion | AVProductStatesIdentifier | AVProductsInstalled | CountryIdentifier | CityIdentifier | OrganizationIdentifier | GeoNameIdentifier | ... | Census_IsFlightingInternal | Census_ThresholdOptIn | Census_FirmwareManufacturerIdentifier | Census_FirmwareVersionIdentifier | Census_IsSecureBootEnabled | Census_IsWIMBootEnabled | Census_IsTouchEnabled | Wdft_IsGamer | Wdft_RegionIdentifier | HasDetections | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000028988387b115f69f31a3bf04f09 | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1735.0 | 53447.0 | 1.0 | 29 | 128035.0 | 18.0 | 35.0 | ... | NaN | NaN | 628.0 | 36144.0 | 0 | NaN | 0 | 0.0 | 10.0 | 0 |
| 1 | 000007535c3f730efa9ea0b7ef1bd645 | 1.1.14600.4 | 4.13.17134.1 | 1.263.48.0 | 53447.0 | 1.0 | 93 | 1482.0 | 18.0 | 119.0 | ... | NaN | NaN | 628.0 | 57858.0 | 0 | NaN | 0 | 0.0 | 8.0 | 0 |
| 2 | 000007905a28d863f6d0d597892cd692 | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1341.0 | 53447.0 | 1.0 | 86 | 153579.0 | 18.0 | 64.0 | ... | NaN | NaN | 142.0 | 52682.0 | 0 | NaN | 0 | 0.0 | 3.0 | 0 |
| 3 | 00000b11598a75ea8ba1beea8459149f | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1527.0 | 53447.0 | 1.0 | 88 | 20710.0 | NaN | 117.0 | ... | NaN | NaN | 355.0 | 20050.0 | 0 | NaN | 0 | 0.0 | 3.0 | 1 |
| 4 | 000014a5f00daa18e76b81417eeb99fc | 1.1.15100.1 | 4.18.1807.18075 | 1.273.1379.0 | 53447.0 | 1.0 | 18 | 37376.0 | NaN | 277.0 | ... | 0.0 | 0.0 | 355.0 | 19844.0 | 0 | 0.0 | 0 | 0.0 | 1.0 | 1 |
5 rows × 57 columns
def plot_categorical_feature(col, only_bars=False, top_n=10, by_touch=False):
top_n = top_n if train[col].nunique() > top_n else train[col].nunique()
print(f"{col} has {train[col].nunique()} unique values and type: {train[col].dtype}.")
print(train[col].value_counts(normalize=True, dropna=False).head())
if not by_touch:
if not only_bars:
df = train.groupby([col]).agg({'HasDetections': ['count', 'mean']})
df = df.sort_values(('HasDetections', 'count'), ascending=False).head(top_n).sort_index()
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Bar(x=df.index, y=df['HasDetections']['count'].values, name='counts'), secondary_y=False)
fig.add_trace(go.Scatter(x=df.index, y=df['HasDetections']['mean'], name='Detections rate'), secondary_y=True)
fig.update_layout(title=f"Counts of {col} by top-{top_n} categories and mean target value",
xaxis=dict(title=f'{col}',
showgrid=False,
zeroline=False,
showline=False,),
yaxis=dict(title='Counts',
showgrid=False,
zeroline=False,
showline=False,),
yaxis2=dict(title='Detections rate', overlaying='y', side='right'),
legend=dict(orientation="v"))
else:
top_cat = list(train[col].value_counts(dropna=False).index[:top_n])
df0 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 1), col].value_counts().head(10).sort_index()
df1 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 0), col].value_counts().head(10).sort_index()
fig = go.Figure()
fig.add_trace(go.Bar(x=df0.index, y=df0.values, name='Has Detections'))
fig.add_trace(go.Bar(x=df1.index, y=df1.values, name='No Detections'))
fig.update_layout(title=f"Counts of {col} by top-{top_n} categories",
xaxis=dict(title=f'{col}',
showgrid=False,
zeroline=False,
showline=False,),
yaxis=dict(title='Counts',
showgrid=False,
zeroline=False,
showline=False,),
legend=dict(orientation="v"), barmode='group')
fig.show()
else:
top_n = 10
top_cat = list(train[col].value_counts(dropna=False).index[:top_n])
df = train.loc[train[col].isin(top_cat)]
df1 = train.loc[train['Census_IsTouchEnabled'] == 1]
df0 = train.loc[train['Census_IsTouchEnabled'] == 0]
df0_ = df0.groupby([col]).agg({'HasDetections': ['count', 'mean']})
df0_ = df0_.sort_values(('HasDetections', 'count'), ascending=False).head(top_n).sort_index()
df1_ = df1.groupby([col]).agg({'HasDetections': ['count', 'mean']})
df1_ = df1_.loc[df0_.index]
fig = go.Figure()
fig.add_trace(go.Bar(
x=df0_[('HasDetections', 'mean')], y=df0_.index,
orientation='h', name='Census_IsTouchEnabled=0', marker=dict(color='royalblue')
))
fig.add_trace(go.Bar(
x=df1_[('HasDetections', 'mean')], y=df1_.index,
orientation='h', name='Census_IsTouchEnabled=1', marker=dict(color='seagreen')
))
fig.update_layout(
title=f'Distribution of HasDetections by {col}',
xaxis_title='% of HasDetections',
yaxis_title=col,
legend=dict(x=0.85, y=0.95),
margin=dict(l=120, r=20, t=50, b=50),
height=600
)
fig.show()
plot_categorical_feature('Census_IsTouchEnabled', True)
Census_IsTouchEnabled has 2 unique values and type: int8. 0 0.874457 1 0.125543 Name: Census_IsTouchEnabled, dtype: float64
plot_categorical_feature('EngineVersion', by_touch=True)
EngineVersion has 70 unique values and type: category. 1.1.15200.1 0.430990 1.1.15100.1 0.412030 1.1.15000.2 0.029728 1.1.14901.4 0.023809 1.1.14600.4 0.018000 Name: EngineVersion, dtype: float64
plot_categorical_feature('AppVersion')
AppVersion has 110 unique values and type: category. 4.18.1807.18075 0.576050 4.18.1806.18062 0.095380 4.12.16299.15 0.040338 4.10.209.0 0.030539 4.13.17134.1 0.028837 Name: AppVersion, dtype: float64
plot_categorical_feature('AvSigVersion')
AvSigVersion has 8531 unique values and type: category. 1.273.1420.0 0.011469 1.263.48.0 0.010987 1.275.1140.0 0.010899 1.275.727.0 0.010362 1.273.371.0 0.009748 Name: AvSigVersion, dtype: float64
plot_categorical_feature('AVProductStatesIdentifier',True,10)
AVProductStatesIdentifier has 28970 unique values and type: float32. 53447.0 0.652870 7945.0 0.053343 47238.0 0.036727 62773.0 0.029901 46413.0 0.012652 Name: AVProductStatesIdentifier, dtype: float64
plot_categorical_feature('AVProductsInstalled',True)
AVProductsInstalled has 8 unique values and type: float32. 1.0 0.695949 2.0 0.275628 3.0 0.023326 NaN 0.004060 4.0 0.000982 Name: AVProductsInstalled, dtype: float64
plot_categorical_feature('CountryIdentifier',True,20)
CountryIdentifier has 222 unique values and type: int16. 43 0.044519 29 0.039006 141 0.037372 93 0.031791 171 0.031449 Name: CountryIdentifier, dtype: float64
plot_categorical_feature('CityIdentifier',True,20)
CityIdentifier has 107366 unique values and type: float32. NaN 0.036475 130775.0 0.010627 16668.0 0.009503 82373.0 0.009338 10222.0 0.008050 Name: CityIdentifier, dtype: float64
plot_categorical_feature('OrganizationIdentifier', True, by_touch=True)
OrganizationIdentifier has 49 unique values and type: float32. 27.0 0.470377 NaN 0.308415 18.0 0.197745 48.0 0.007156 50.0 0.005100 Name: OrganizationIdentifier, dtype: float64
plot_categorical_feature('GeoNameIdentifier',True)
GeoNameIdentifier has 292 unique values and type: float32. 277.0 0.171712 211.0 0.047432 53.0 0.045823 89.0 0.040441 240.0 0.038846 Name: GeoNameIdentifier, dtype: float64
plot_categorical_feature('LocaleEnglishNameIdentifier', True)
LocaleEnglishNameIdentifier has 276 unique values and type: int16. 75 0.234780 182 0.050450 74 0.046075 42 0.045913 88 0.042058 Name: LocaleEnglishNameIdentifier, dtype: float64
plot_categorical_feature('OsPlatformSubRelease', True, by_touch=True)
OsPlatformSubRelease has 9 unique values and type: category. rs4 0.438887 rs3 0.280635 rs2 0.087460 rs1 0.081917 th2 0.046136 Name: OsPlatformSubRelease, dtype: float64
plot_categorical_feature('OsBuildLab', True)
OsBuildLab has 663 unique values and type: category. 17134.1.amd64fre.rs4_release.180410-1804 0.410044 16299.431.amd64fre.rs3_release_svc_escrow.180502-1908 0.140411 16299.15.amd64fre.rs3_release.170928-1534 0.107724 15063.0.amd64fre.rs2_release.170317-1834 0.080484 17134.1.x86fre.rs4_release.180410-1804 0.028815 Name: OsBuildLab, dtype: float64
plot_categorical_feature('IeVerIdentifier', True)
IeVerIdentifier has 303 unique values and type: float32. 137.0 0.435560 117.0 0.198166 108.0 0.053174 111.0 0.052438 98.0 0.039726 Name: IeVerIdentifier, dtype: float64
plot_categorical_feature('Census_OEMNameIdentifier', True)
Census_OEMNameIdentifier has 3832 unique values and type: float32. 2668.0 0.144289 2102.0 0.116412 1443.0 0.106432 2206.0 0.103609 585.0 0.100370 Name: Census_OEMNameIdentifier, dtype: float64
plot_categorical_feature('Census_ProcessorCoreCount',True,by_touch=True)
Census_ProcessorCoreCount has 45 unique values and type: float32. 4.0 0.608665 2.0 0.259146 8.0 0.096957 12.0 0.010391 1.0 0.007890 Name: Census_ProcessorCoreCount, dtype: float64
plot_categorical_feature('Census_ProcessorModelIdentifier', True)
Census_ProcessorModelIdentifier has 3428 unique values and type: float32. 2697.0 0.032425 1998.0 0.029972 2660.0 0.021453 2373.0 0.019661 1992.0 0.019249 Name: Census_ProcessorModelIdentifier, dtype: float64
plot_categorical_feature('Census_PrimaryDiskTotalCapacity', True)
Census_PrimaryDiskTotalCapacity has 5735 unique values and type: float64. 476940.0 0.318504 953869.0 0.243881 305245.0 0.053199 122104.0 0.052576 244198.0 0.050696 Name: Census_PrimaryDiskTotalCapacity, dtype: float64
plot_categorical_feature('Census_TotalPhysicalRAM', True,by_touch=True)
Census_TotalPhysicalRAM has 3446 unique values and type: float32. 4096.0 0.458950 8192.0 0.246204 2048.0 0.123015 16384.0 0.059582 6144.0 0.044687 Name: Census_TotalPhysicalRAM, dtype: float64